import os
import re
from collections import Counter

import numpy as np
from sklearn.externals import joblib  # NOTE(review): removed in scikit-learn >= 0.23; use `import joblib` there
from sklearn.preprocessing import OneHotEncoder

# Sample input row ("::"-delimited):
#   score::movie_id::title (year)::genre|genre::user_id::gender::age::occupation
# e.g. 2::3124::Agnes Browne (1999)::Comedy|Drama::5493::M::35::12
# Input training data (one rating per line, "::"-delimited) and the directory
# where the fitted one-hot encoder models are saved.
SOURCE_DATA_PATH = "../../data/ml-1m_20190508/TRAIN_20190506.csv"
SOURCE_OH_MODEL_PATH = "../../data/ml-1m_20190508/oh_encoder_20190509.1"

# exist_ok avoids the check-then-create race of `if not exists: makedirs`.
os.makedirs(SOURCE_OH_MODEL_PATH, exist_ok=True)


# Matches the content of each "( ... )" group; the release year is the last one,
# e.g. "Agnes Browne (1999)" -> "1999". Compiled once instead of per call.
_YEAR_RE = re.compile(r'[(](.*?)[)]', re.S)


def get_year(movie):
    """Extract the release year from a movie title string.

    Takes the content of the LAST parenthesised group (titles may contain
    earlier groups such as "(a.k.a. ...)"). Returns 0 when no parentheses
    are present or when the last group is not an integer.
    """
    res = _YEAR_RE.findall(movie)
    if not res:
        return 0
    try:
        return int(res[-1].strip())
    except ValueError:
        # Last group was not a year, e.g. "Some Title (a.k.a. Other)".
        return 0


def cal_count(list1):
    """Return a dict mapping each item in *list1* to its occurrence count.

    Thin wrapper around collections.Counter (replaces the hand-rolled
    counting loop); returns a plain dict for compatibility with callers.
    """
    return dict(Counter(list1))


# Per-column buffers for the one-hot encoders. Each value is wrapped in a
# single-element list because OneHotEncoder expects 2-D input.
data = {name: [] for name in (
    "scores", "movie_id", "movie_year", "movie_type",
    "user_id", "user_gentle", "user_age", "user_occupation",
)}

with open(SOURCE_DATA_PATH, encoding = "utf8") as f:
    next(f, None)  # skip the header row
    for line in f:
        fields = line.strip().split("::")
        data["scores"].append([fields[0]])
        data["movie_id"].append([fields[1]])
        data["movie_year"].append([get_year(fields[2])])
        # A movie may carry several "|"-separated genres; each one becomes
        # its own row so the encoder learns every genre value.
        for genre in fields[3].split("|"):
            data["movie_type"].append([genre])
        data["user_id"].append([fields[4]])
        data["user_gentle"].append([fields[5]])
        data["user_age"].append([fields[6]])
        data["user_occupation"].append([fields[7]])

# Fit one dense one-hot encoder per column.
# NOTE(review): `sparse=False` was renamed `sparse_output` in scikit-learn 1.2
# and removed in 1.4 — confirm the pinned sklearn version before upgrading.
oh_encoder = {}
for name, column in data.items():
    encoder = OneHotEncoder(sparse=False)
    encoder.fit(column)
    oh_encoder[name] = encoder

# Persist each fitted encoder as <model dir>/<column>.model.
for name, encoder in oh_encoder.items():
    joblib.dump(encoder, "%s/%s.model" % (SOURCE_OH_MODEL_PATH, name))
